
# Source Generated with Decompyle++
# File: in.pyc (Python 2.4)

from __future__ import generators

import math
import re
import os
import sys
import socket
import pickle
import urllib2
from email import message_from_string

try:
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set

try:
    enumerate
except NameError:
    def enumerate(seq):
        i = 0
        for elt in seq:
            yield (i, elt)
            i += 1

DOMAIN_AND_PORT_RE = re.compile(r'([^:/\\]+)(:([\d]+))?')
HTTP_ERROR_RE = re.compile(r'HTTP Error ([\d]+)')
URL_KEY_RE = re.compile(r'[\W]')
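
# For example (hypothetical URL, not taken from this module): matching
# 'example.com:8080/index.html' against DOMAIN_AND_PORT_RE gives
# group(1) == 'example.com' and group(3) == '8080'; when no port is
# present, group(3) is None and slurp() falls back to port 80.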

from spambayes.Options import options
from spambayes.chi2 import chi2Q

try:
    (True, False)
except NameError:
    # Maintain compatibility with Python versions that lack a bool type.
    (True, False) = (1, 0)

LN2 = math.log(2)       # used frequently by chi-combining

slurp_wordstream = None

PICKLE_VERSION = 5


class WordInfo(object):
    # Per-token record: spamcount is the number of trained spam messages
    # the token appeared in, hamcount the number of trained ham messages.
    # __slots__ keeps these records small, since there is one per token.
    __slots__ = ('spamcount', 'hamcount')

    def __init__(self):
        self.__setstate__((0, 0))

    def __repr__(self):
        return 'WordInfo' + repr((self.spamcount, self.hamcount))

    def __getstate__(self):
        return (self.spamcount, self.hamcount)

    def __setstate__(self, t):
        (self.spamcount, self.hamcount) = t
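
# With the default (protocol 0) pickle, objects that define __slots__ and no
# __dict__ need the explicit __getstate__/__setstate__ pair above to be
# picklable; a fresh WordInfo round-trips (hypothetically, via
# pickle.dumps/pickle.loads) as the bare tuple (0, 0).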


class Classifier:
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    def chi2_spamprob(self, wordstream, evidence=False):
        '''Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        '''
        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  To avoid underflowing the running products of probabilities,
        # each is kept as a mantissa (S, H) plus a base-2 exponent
        # (Sexp, Hexp) via frexp().
        H = S = 1.0
        Hexp = Sexp = 0
        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:
                (S, e) = frexp(S)
                Sexp += e
            if H < 1e-200:
                (H, e) = frexp(H)
                Hexp += e

        # sum-of-logs == log-of-product:  ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2 * n)
            H = 1.0 - chi2Q(-2.0 * H, 2 * n)
            # Combine the two indicators into a single score in [0.0, 1.0].
            prob = ((S - H) + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for (p, w, _r) in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(1, ('*H*', H))
            return (prob, clues)
        return prob
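
    # Rough usage sketch (hypothetical classifier instance and token list,
    # not part of this module):
    #
    #     prob, clues = bayes.chi2_spamprob(tokens, evidence=True)
    #
    # prob is the combined score in [0.0, 1.0]; clues lists the individual
    # (word, probability) pairs, with the chi-squared components prepended
    # as the pseudo-words '*S*' and '*H*'.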

    def slurping_spamprob(self, wordstream, evidence=False):
        '''Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message.'''
        h_cut = options[('Categorization', 'ham_cutoff')]
        s_cut = options[('Categorization', 'spam_cutoff')]

        # Get the raw score.
        (prob, clues) = self.chi2_spamprob(wordstream, True)

        # If the message is unsure and there is still room for more
        # discriminators, rescore with the tokens slurped from whatever
        # the message's URLs point at.
        if len(clues) < options[('Classifier', 'max_discriminators')] and \
           h_cut < prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, p) in clues])
            (sprob, sclues) = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                (prob, clues) = (sprob, sclues)

        if evidence:
            return (prob, clues)
        return prob

    if options[('Classifier', 'use_chi_squared_combining')]:
        if options[('URLRetriever', 'x-slurp_urls')]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options[('Classifier', 'use_bigrams')]:
            wordstream = self._enhance_wordstream(wordstream)
        if options[('URLRetriever', 'x-slurp_urls')]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        '''In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        '''
        if options[('Classifier', 'use_bigrams')]:
            wordstream = self._enhance_wordstream(wordstream)
        if options[('URLRetriever', 'x-slurp_urls')]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)
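
    # Typical round trip (hypothetical wordstream; the tokenizer lives in
    # spambayes.tokenizer, not here):
    #
    #     bayes.learn(tokens, True)      # train the message as spam
    #     bayes.unlearn(tokens, True)    # undo it with the same arguments
    #
    # Because unlearn() re-applies the same bigram/slurp transformations,
    # the per-token counts return to their previous values, assuming the
    # stream produces the same tokens both times.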

    def probability(self, record):
        '''Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.
        '''
        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first.
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, 'Token seen in more ham than ham trained.'
        hamratio = hamcount / nham

        assert spamcount <= nspam, 'Token seen in more spam than spam trained.'
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options[('Classifier', 'unknown_word_strength')]
        StimesX = S * options[('Classifier', 'unknown_word_prob')]

        # Bayesian adjustment toward unknown_word_prob, weighted by the
        # amount of evidence (n = total number of times the token was seen).
        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache.
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob
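
    # Worked example of the adjustment above (illustrative numbers only,
    # assuming the default unknown_word_strength S = 0.45 and
    # unknown_word_prob x = 0.5): a token seen in 3 of 10 spam and 1 of 10
    # ham gives the raw ratio prob = 0.3 / (0.1 + 0.3) = 0.75; with n = 4
    # observations the smoothed value is
    # (0.45*0.5 + 4*0.75) / (0.45 + 4) ~= 0.72, so lightly-seen tokens are
    # pulled toward 0.5 while heavily-seen ones keep their raw ratio.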

    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()

            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1

            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError('spam count would go negative!')
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError('non-spam count would go negative!')
            self.nham -= 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        '''This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890.'''
        pass

    def _getclues(self, wordstream):
        mindist = options[('Classifier', 'minimum_prob_strength')]

        if options[('Classifier', 'use_bigrams')]:
            # This scheme mixes single tokens with pairs of adjacent tokens,
            # "tiling" wordstream into non-overlapping unigrams and bigrams
            # so that no original token contributes to more than one clue.
            # raw holds ((distance, prob, word, record), indices) pairs,
            # where indices are the 0-based positions of the tokens that
            # went into the clue (a 1-tuple for a unigram, a 2-tuple for a
            # synthesized bigram).
            raw = []
            push = raw.append
            pair = None
            # Track which tokens we've already scored; seeding with the
            # None pair skips the bigram slot on the first trip.
            seen = {pair: 1}
            for i, token in enumerate(wordstream):
                if i:   # not the first token, so there is a preceding one
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = 'bi:%s %s' % (last_token, token)
                last_token = token
                for clue, indices in ((token, (i,)), (pair, (i - 1, i))):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw strongest-to-weakest, then keep the strongest
            # non-overlapping clues.
            raw.sort()
            raw.reverse()
            clues = []
            push = clues.append
            seen = {}   # indices that already back a kept clue
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap:     # no overlap with anything already kept
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            # Leave sorted from smallest to largest spamprob.
            clues.reverse()
        else:
            # The all-unigram scheme just scores the tokens as-is.  A Set()
            # weeds out duplicates at high speed.
            clues = []
            push = clues.append
            for word in Set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        if len(clues) > options[('Classifier', 'max_discriminators')]:
            del clues[0:-options[('Classifier', 'max_discriminators')]]
        # Return (prob, word, record) triples.
        return [t[1:] for t in clues]
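
    # Tiling sketch (hypothetical three-token stream): for ['a', 'b', 'c']
    # the candidate clues are 'a', 'b', 'c', 'bi:a b' and 'bi:b c'; after
    # the strength sort, 'bi:a b' is only kept if neither position 0 nor
    # position 1 already backs a stronger clue.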

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options[('Classifier', 'unknown_word_prob')]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return (distance, prob, word, record)

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        '''Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
        to avoid conflict with tokens we generate (like "subject: word",
        which could be "word" in a subject, or a bigram of "subject:" and
        "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        '''
        last = None
        for token in wordstream:
            yield token
            if last:
                yield 'bi:%s %s' % (last, token)
            last = token
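
    # For example (hypothetical call, assuming a plain list stands in for
    # the tokenizer's output):
    #
    #     list(bayes._enhance_wordstream(['a', 'b', 'c']))
    #     -> ['a', 'b', 'bi:a b', 'c', 'bi:b c']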

    def _generate_slurp(self):
        if not hasattr(self, 'setup_done'):
            self.setup()
            self.setup_done = True
        # do_slurp guards against recursively slurping the URLs found on
        # pages we are already in the middle of slurping.
        if not hasattr(self, 'do_slurp') or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False
                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []

    def setup(self):
        # Imported here rather than at module level to avoid a circular
        # import.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options[('globals', 'proxy_username')]
        password = options[('globals', 'proxy_password')]
        server = options[('globals', 'proxy_server')]
        if server.find(':') != -1:
            (server, port) = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization.
            proxy_support = urllib2.ProxyHandler({
                'http': 'http://%s:%s@%s:%s' % (username, password, server, port) })
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        # Set up the cache for retrieved urls.
        age = options[('URLRetriever', 'x-cache_expiry_days')] * 24 * 60 * 60
        dir = options[('URLRetriever', 'x-cache_directory')]
        if not os.path.exists(dir):
            if options[('globals', 'verbose')]:
                print >>sys.stderr, 'Creating URL cache directory'
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(), dir, cacheSize=20)
        # Kill any expired information in the cache.
        self.urlCorpus.removeExpiredMessages()

        # Set up caches for unretrievable urls.
        self.bad_url_cache_name = os.path.join(dir, 'bad_urls.pck')
        self.http_error_cache_name = os.path.join(dir, 'http_error_urls.pck')
        if os.path.exists(self.bad_url_cache_name):
            b_file = file(self.bad_url_cache_name, 'r')
            try:
                self.bad_urls = pickle.load(b_file)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle, probably).
                # Start afresh.
                if options[('globals', 'verbose')]:
                    print >>sys.stderr, 'Bad URL pickle, using new.'
                self.bad_urls = {
                    'url:non_resolving': (),
                    'url:non_html': (),
                    'url:unknown_error': () }
            b_file.close()
        else:
            if options[('globals', 'verbose')]:
                print "URL caches don't exist: creating"
            self.bad_urls = {
                'url:non_resolving': (),
                'url:non_html': (),
                'url:unknown_error': () }
        if os.path.exists(self.http_error_cache_name):
            h_file = file(self.http_error_cache_name, 'r')
            try:
                self.http_error_urls = pickle.load(h_file)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle, probably).
                # Start afresh.
                if options[('globals', 'verbose')]:
                    print >>sys.stderr, 'Bad HTTP error pickle, using new.'
                self.http_error_urls = {}
            h_file.close()
        else:
            self.http_error_urls = {}

    def _save_caches(self):
        for name, data in [
            (self.bad_url_cache_name, self.bad_urls),
            (self.http_error_cache_name, self.http_error_urls)]:
            # Save to a temporary file first, in case something goes wrong.
            cache = open(name + '.tmp', 'w')
            pickle.dump(data, cache)
            cache.close()
            try:
                os.rename(name + '.tmp', name)
            except OSError:
                # win32 can't rename onto an existing file, so remove the
                # old cache first.
                os.remove(name)
                os.rename(name + '.tmp', name)
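
    # The write-to-.tmp-then-rename dance above means a crash mid-dump
    # leaves the previous cache file intact; only a fully written temp
    # file ever replaces it.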

    def slurp(self, proto, url):
        # Tokenize the page at the given URL, caching failures as special
        # url:* tokens.
        if not url:
            return ['url:non_resolving']

        from spambayes.tokenizer import Tokenizer

        if options[('URLRetriever', 'x-only_slurp_base')]:
            url = self._base_url(url)

        # Check the unretrievable caches.
        for err in self.bad_urls.keys():
            if url in self.bad_urls[err]:
                return [err]
        if self.http_error_urls.has_key(url):
            return self.http_error_urls[url]

        # Check that the url will resolve first.
        mo = DOMAIN_AND_PORT_RE.match(url)
        domain = mo.group(1)
        if mo.group(3) is None:
            port = 80
        else:
            port = mo.group(3)
        try:
            not_used = socket.getaddrinfo(domain, port)
        except socket.error:
            self.bad_urls['url:non_resolving'] += (url,)
            return ['url:non_resolving']

        # If the page is in our cache, skip the network and use that copy.
        url_key = URL_KEY_RE.sub('_', url)
        cached_message = self.urlCorpus.get(url_key)
        if cached_message is None:
            # We're going to ignore everything that isn't text/html, so
            # don't bother retrieving anything with these extensions.
            parts = url.split('.')
            if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
                self.bad_urls['url:non_html'] += (url,)
                return ['url:non_html']

            # Waiting for the default timeout period slows everything down
            # far too much, so try to reduce it for just this call (this
            # will only work with Python 2.3 and above).
            try:
                timeout = socket.getdefaulttimeout()
                socket.setdefaulttimeout(5)
            except AttributeError:
                pass

            try:
                if options[('globals', 'verbose')]:
                    print >>sys.stderr, 'Slurping', url
                f = urllib2.urlopen('%s://%s' % (proto, url))
            except (urllib2.URLError, socket.error), details:
                mo = HTTP_ERROR_RE.match(str(details))
                if mo:
                    self.http_error_urls[url] = 'url:http_' + mo.group(1)
                    return ['url:http_' + mo.group(1)]
                self.bad_urls['url:unknown_error'] += (url,)
                return ['url:unknown_error']

            # Restore the timeout.
            try:
                socket.setdefaulttimeout(timeout)
            except AttributeError:
                pass

            try:
                # Anything that isn't text/html is ignored.
                content_type = f.info().get('content-type')
                if content_type is None or not content_type.startswith('text/html'):
                    self.bad_urls['url:non_html'] += (url,)
                    return ['url:non_html']

                page = f.read()
                headers = str(f.info())
                f.close()
            except socket.error:
                # Probably a temporary error, like a timeout; just bail out.
                return []

            fake_message_string = headers + '\r\n' + page

            # Retrieving the same page over and over would tire us out,
            # so store it in our own little cache.
            message = self.urlCorpus.makeMessage(url_key, fake_message_string)
            self.urlCorpus.addMessage(message)
        else:
            fake_message_string = cached_message.as_string()

        msg = message_from_string(fake_message_string)

        # We don't want to do full header tokenising, as that is optimised
        # for messages, not webpages, so just do the basic stuff.
        bht = options[('Tokenizer', 'basic_header_tokenize')]
        bhto = options[('Tokenizer', 'basic_header_tokenize_only')]
        options[('Tokenizer', 'basic_header_tokenize')] = True
        options[('Tokenizer', 'basic_header_tokenize_only')] = True

        tokens = Tokenizer().tokenize(msg)
        pf = options[('URLRetriever', 'x-web_prefix')]
        tokens = ['%s%s' % (pf, tok) for tok in tokens]

        # Undo the option changes.
        options[('Tokenizer', 'basic_header_tokenize')] = bht
        options[('Tokenizer', 'basic_header_tokenize_only')] = bhto
        return tokens
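
    # The special tokens produced above are 'url:non_resolving',
    # 'url:non_html', 'url:unknown_error' and 'url:http_NNN' (one per HTTP
    # error code seen); anything else is the tokenized page content with
    # the configured x-web_prefix prepended.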

    def _base_url(self, url):
        url += '/'
        (domain, garbage) = url.split('/', 1)
        parts = domain.split('.')
        if len(parts) > 2:
            base_domain = parts[-2] + '.' + parts[-1]
            if len(parts[-1]) < 3:
                base_domain = parts[-3] + '.' + base_domain
        else:
            base_domain = domain
        return base_domain
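
    # Illustrative behaviour (hypothetical inputs): 'www.example.com/buy'
    # reduces to 'example.com', while 'shop.example.co.uk/x' keeps three
    # components ('example.co.uk') because the final part is shorter than
    # three characters.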

    def _add_slurped(self, wordstream):
        """Add tokens generated by 'slurping' (i.e. tokenizing
        the text at the web pages pointed to by URLs in messages)
        to the wordstream."""
        for token in wordstream:
            yield token
        slurped_tokens = self._generate_slurp()
        for token in slurped_tokens:
            yield token

    def _wordinfokeys(self):
        return self.wordinfo.keys()


Bayes = Classifier